In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
First, we load the entire NYTimes article corpus and the sentiment analysis routine we created (in sentiwordnet.py).
In [8]:
DATA_DIR = "./data/"
In [9]:
all_data_list = []
for year in range(1990, 2017):
    data = pd.read_csv(DATA_DIR + '{}_Output.csv'.format(year), header=None, encoding="utf-8")
    all_data_list.append(data)  # list of per-year dataframes
data = pd.concat(all_data_list, axis=0)
data.columns = ['id', 'date', 'headline', 'lead']
# Drop duplicates and parse dates
data = data.drop_duplicates("id").reset_index(drop=True)
data.date = pd.to_datetime(data.date)
# Replace missing article leads with empty strings
data.loc[data.lead.isnull(), "lead"] = ""
In [3]:
# The sentiment routine is defined in sentiwordnet.py (see also the SentiWordNet notebook).
# If NLTK complains about missing corpora, try:
# import nltk
# nltk.download("sentiwordnet")
# nltk.download("stopwords")
import sentiwordnet
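The scoring function itself, get_sentiment_2, lives in sentiwordnet.py and returns a (score, count) pair for each text. As a rough, illustrative sketch only (not the actual implementation), a routine of this kind might tokenize the lead, drop stopwords, and sum each remaining word's SentiWordNet positive-minus-negative score, roughly like this:
In [ ]:
# Illustrative sketch only -- the real implementation is in sentiwordnet.py.
# Assumes NLTK's punkt, stopwords, wordnet and sentiwordnet corpora are downloaded.
import nltk
from nltk.corpus import sentiwordnet as swn, stopwords

STOPWORDS = set(stopwords.words("english"))

def get_sentiment_sketch(text):
    """Return (summed sentiment score, number of scored words) for a text."""
    score, count = 0.0, 0
    for word in nltk.word_tokenize(text.lower()):
        if word in STOPWORDS or not word.isalpha():
            continue
        synsets = list(swn.senti_synsets(word))
        if not synsets:
            continue
        # Use the first (most common) synset's positive minus negative score
        score += synsets[0].pos_score() - synsets[0].neg_score()
        count += 1
    return score, count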
We'll subset the data and process it one year at a time, writing each year's results to disk. The sentiment scoring is computationally expensive, so we don't want to repeat the whole run if something goes wrong partway through, and we might also want the intermediate per-year results later.
In [ ]:
import datetime

years = range(1990, 2017)
for year in years:
    print(year, datetime.datetime.now())
    data_subset = data[data.date.dt.year == year].set_index("id")
    sentiments_subset = pd.DataFrame(
        list(map(sentiwordnet.get_sentiment_2, data_subset.lead.values)),
        columns=["score", "count"]
    )
    sentiments_subset.index = data_subset.index.copy()
    sentiments_subset.to_csv("./sentiments_{}.csv".format(year))
We can then load these up again later, and take a look at the format:
In [5]:
years = range(1990, 2017)
sentiments = []
for year in years:
    sentiment = pd.read_csv("./sentiments_{}.csv".format(year))
    sentiments.append(sentiment)
sentiments = pd.concat(sentiments)
sentiments.head()
Out[5]:
In [10]:
data_subset = data[data.date.dt.year.isin(years)].set_index("id")
data_subset = data_subset.join(sentiments.set_index("id"))
We can also view some histograms to see the distribution of the score values and sentiment-word counts:
In [18]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
data_subset["score"].hist(bins=100, ax=ax[0])
data_subset["count"].hist(bins=100, ax=ax[1])
# suptitle covers both panels (plt.title would only label the right-hand axes)
fig.suptitle("Distribution of scores and sentiment word counts across all articles")
Out[18]:
As we can see, the scores are distributed fairly symmetrically around 0. Some examples of positive-sentiment articles:
In [19]:
data_subset.nlargest(15, "score")
Out[19]:
And some examples of negative-sentiment articles:
In [20]:
data_subset.nsmallest(15, "score")
Out[20]:
Now we can aggregate the sentiment data to daily totals and compute a daily average score:
In [23]:
daily_sentiment = data_subset[["date", "score", "count"]].set_index("date").resample("D").sum()
daily_sentiment["avg_score"] = daily_sentiment["score"] / daily_sentiment["count"]
daily_sentiment.head()
Out[23]:
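Note that on days with no scored sentiment words the division above is 0/0, which pandas records as NaN; the later resample(...).mean() calls skip NaN by default, but if an explicit neutral score is preferred, a one-line fill (a sketch, not part of the pipeline above) would do it:
In [ ]:
# Optional sketch: treat days with no scored sentiment words as neutral (0.0).
daily_sentiment["avg_score"] = daily_sentiment["avg_score"].fillna(0.0)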
And save the file:
In [64]:
daily_sentiment.to_csv("./daily_sentiment_1990_2016.csv")
Now, we can also reduce the data to only the first two weeks of each month, to see whether we can later use it as a predictor for the CCI (Consumer Confidence Index), which gets released at the beginning of every month:
In [24]:
first_two = daily_sentiment[daily_sentiment.index.day < 15]  # keep days 1-14 of each month
In [25]:
first_two.head()
Out[25]:
In [34]:
plt.figure(figsize=(15,5))
plt.title("Sentiment scores from 1990-2016")
first_two.avg_score.resample("MS").mean().plot(label="First two weeks of each month")
daily_sentiment.avg_score.resample("MS").mean().plot(label="Full month")
plt.legend()
Out[34]:
And from there we can aggregate up to a monthly frequency, to match our CCI indicator:
In [35]:
monthly_sentiment = data_subset[["date", "score", "count"]].set_index("date").resample("MS").sum()
monthly_sentiment["avg_score"] = monthly_sentiment["score"] / monthly_sentiment["count"]
monthly_sentiment.head()
Out[35]:
In [ ]:
daily_sentiment.to_csv("./monthly_sentiment_1990_2016.csv")